{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9fadb9cd-fe0b-49c8-b62d-b6ae656d543c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from warnings import simplefilter\n",
    "from sklearn.exceptions import ConvergenceWarning\n",
    "simplefilter(\"ignore\", category=ConvergenceWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12dcda70-7854-4dd3-b044-9978a0f63c33",
   "metadata": {},
   "source": [
    "## TASK1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2b0cb4c6-ba3e-4be5-b6a0-c3fe821b3cfd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train = pd.read_csv('./基于论文摘要的文本分类与关键词抽取挑战赛公开数据/train.csv')\n",
    "train['title'] = train['title'].fillna('')\n",
    "train['abstract'] = train['abstract'].fillna('')\n",
    "\n",
    "test = pd.read_csv('./基于论文摘要的文本分类与关键词抽取挑战赛公开数据/test.csv')\n",
    "test['title'] = test['title'].fillna('')\n",
    "test['abstract'] = test['abstract'].fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "829bfa2f-4e38-40bf-a2c7-127c0fbf96e5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uuid</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>abstract</th>\n",
       "      <th>Keywords</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Accessible Visual Artworks for Blind and Visua...</td>\n",
       "      <td>Quero, Luis Cavazos; Bartolome, Jorge Iranzo; ...</td>\n",
       "      <td>Despite the use of tactile graphics and audio ...</td>\n",
       "      <td>accessibility technology; multimodal interacti...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Seizure Detection and Prediction by Parallel M...</td>\n",
       "      <td>Li, Chenqi; Lammie, Corey; Dong, Xuening; Amir...</td>\n",
       "      <td>During the past two decades, epileptic seizure...</td>\n",
       "      <td>CNN; Seizure Detection; Seizure Prediction; EE...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Fast ScanNet: Fast and Dense Analysis of Multi...</td>\n",
       "      <td>Lin, Huangjing; Chen, Hao; Graham, Simon; Dou,...</td>\n",
       "      <td>Lymph node metastasis is one of the most impor...</td>\n",
       "      <td>Histopathology image analysis; computational p...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Long-Term Effectiveness of Antiretroviral Ther...</td>\n",
       "      <td>Huang, Peng; Tan, Jingguang; Ma, Wenzhe; Zheng...</td>\n",
       "      <td>In order to assess the effectiveness of the Ch...</td>\n",
       "      <td>HIV; ART; mortality; observational cohort stud...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Real-Time Facial Affective Computing on Mobile...</td>\n",
       "      <td>Guo, Yuanyuan; Xia, Yifan; Wang, Jing; Yu, Hui...</td>\n",
       "      <td>Convolutional Neural Networks (CNNs) have beco...</td>\n",
       "      <td>facial affective computing; convolutional neur...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   uuid                                              title  \\\n",
       "0     0  Accessible Visual Artworks for Blind and Visua...   \n",
       "1     1  Seizure Detection and Prediction by Parallel M...   \n",
       "2     2  Fast ScanNet: Fast and Dense Analysis of Multi...   \n",
       "3     3  Long-Term Effectiveness of Antiretroviral Ther...   \n",
       "4     4  Real-Time Facial Affective Computing on Mobile...   \n",
       "\n",
       "                                              author  \\\n",
       "0  Quero, Luis Cavazos; Bartolome, Jorge Iranzo; ...   \n",
       "1  Li, Chenqi; Lammie, Corey; Dong, Xuening; Amir...   \n",
       "2  Lin, Huangjing; Chen, Hao; Graham, Simon; Dou,...   \n",
       "3  Huang, Peng; Tan, Jingguang; Ma, Wenzhe; Zheng...   \n",
       "4  Guo, Yuanyuan; Xia, Yifan; Wang, Jing; Yu, Hui...   \n",
       "\n",
       "                                            abstract  \\\n",
       "0  Despite the use of tactile graphics and audio ...   \n",
       "1  During the past two decades, epileptic seizure...   \n",
       "2  Lymph node metastasis is one of the most impor...   \n",
       "3  In order to assess the effectiveness of the Ch...   \n",
       "4  Convolutional Neural Networks (CNNs) have beco...   \n",
       "\n",
       "                                            Keywords  label  \n",
       "0  accessibility technology; multimodal interacti...      0  \n",
       "1  CNN; Seizure Detection; Seizure Prediction; EE...      1  \n",
       "2  Histopathology image analysis; computational p...      1  \n",
       "3  HIV; ART; mortality; observational cohort stud...      0  \n",
       "4  facial affective computing; convolutional neur...      0  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a4bc6c89-002e-4941-bfb9-ca7406c1ae9e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train['text'] = train['title'].fillna('') + ' ' +  train['author'].fillna('') + ' ' + train['abstract'].fillna('')+ ' ' + train['Keywords'].fillna('')\n",
    "test['text'] = test['title'].fillna('') + ' ' +  test['author'].fillna('') + ' ' + test['abstract'].fillna('')+ ' ' + train['Keywords'].fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "371f23ec-03c9-4003-8666-479359bfe6d2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "vector = CountVectorizer().fit(train['text'])\n",
    "train_vector = vector.transform(train['text'])\n",
    "test_vector = vector.transform(test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b4b555b6-373d-43ae-99c8-51b85aec385a",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0    0.98030   0.98571   0.98300      3079\n",
      "           1    0.98485   0.97912   0.98197      2921\n",
      "\n",
      "    accuracy                        0.98250      6000\n",
      "   macro avg    0.98257   0.98241   0.98249      6000\n",
      "weighted avg    0.98251   0.98250   0.98250      6000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_pred = cross_val_predict(\n",
    "    LogisticRegression(),\n",
    "    train_vector,\n",
    "    train['label']\n",
    ")\n",
    "print(classification_report(train['label'], train_pred, digits=5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fd736331-1b2b-42e4-aa4c-fd6f2b8436e5",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "model = LogisticRegression()\n",
    "model.fit(train_vector, train['label'])\n",
    "test['label'] = model.predict(test_vector)\n",
    "test[['uuid', 'Keywords', 'label']].to_csv('submit.csv', index=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ce75c4d-915e-4f0b-a142-b16567fda726",
   "metadata": {},
   "source": [
    "## TASK2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2b493756-886a-4a08-81ed-b33db69a1e72",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import word_tokenize, ngrams\n",
    "stops = [\n",
    "    'will', 'can', \"couldn't\", 'same', 'own', \"needn't\", 'between', \"shan't\", 'very',\n",
    "     'so', 'over', 'in', 'have', 'the', 's', 'didn', 'few', 'should', 'of', 'that', \n",
    "     'don', 'weren', 'into', \"mustn't\", 'other', 'from', \"she's\", 'hasn', \"you're\",\n",
    "     'ain', 'ours', 'them', 'he', 'hers', 'up', 'below', 'won', 'out', 'through',\n",
    "     'than', 'this', 'who', \"you've\", 'on', 'how', 'more', 'being', 'any', 'no',\n",
    "     'mightn', 'for', 'again', 'nor', 'there', 'him', 'was', 'y', 'too', 'now',\n",
    "     'whom', 'an', 've', 'or', 'itself', 'is', 'all', \"hasn't\", 'been', 'themselves',\n",
    "     'wouldn', 'its', 'had', \"should've\", 'it', \"you'll\", 'are', 'be', 'when', \"hadn't\",\n",
    "     \"that'll\", 'what', 'while', 'above', 'such', 'we', 't', 'my', 'd', 'i', 'me',\n",
    "     'at', 'after', 'am', 'against', 'further', 'just', 'isn', 'haven', 'down',\n",
    "     \"isn't\", \"wouldn't\", 'some', \"didn't\", 'ourselves', 'their', 'theirs', 'both',\n",
    "     're', 'her', 'ma', 'before', \"don't\", 'having', 'where', 'shouldn', 'under',\n",
    "     'if', 'as', 'myself', 'needn', 'these', 'you', 'with', 'yourself', 'those',\n",
    "     'each', 'herself', 'off', 'to', 'not', 'm', \"it's\", 'does', \"weren't\", \"aren't\",\n",
    "     'were', 'aren', 'by', 'doesn', 'himself', 'wasn', \"you'd\", 'once', 'because', 'yours',\n",
    "     'has', \"mightn't\", 'they', 'll', \"haven't\", 'but', 'couldn', 'a', 'do', 'hadn',\n",
    "     \"doesn't\", 'your', 'she', 'yourselves', 'o', 'our', 'here', 'and', 'his', 'most',\n",
    "     'about', 'shan', \"wasn't\", 'then', 'only', 'mustn', 'doing', 'during', 'why',\n",
    "     \"won't\", 'until', 'did', \"shouldn't\", 'which'\n",
    "]\n",
    "\n",
    "def extract_keywords_by_freq(title, abstract):\n",
    "    ngrams_count = list(ngrams(word_tokenize(title.lower()), 2)) + list(ngrams(word_tokenize(abstract.lower()), 2))\n",
    "    ngrams_count = pd.DataFrame(ngrams_count)\n",
    "    ngrams_count = ngrams_count[~ngrams_count[0].isin(stops)]\n",
    "    ngrams_count = ngrams_count[~ngrams_count[1].isin(stops)]\n",
    "    ngrams_count = ngrams_count[ngrams_count[0].apply(len) > 3]\n",
    "    ngrams_count = ngrams_count[ngrams_count[1].apply(len) > 3]\n",
    "    ngrams_count['phrase'] = ngrams_count[0] + ' ' + ngrams_count[1]\n",
    "    ngrams_count = ngrams_count['phrase'].value_counts()\n",
    "    ngrams_count = ngrams_count[ngrams_count > 1]\n",
    "    return list(ngrams_count.index)[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f3e68211-586e-4fd1-9485-be713bda8df0",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['visual artworks',\n",
       " 'visually impaired',\n",
       " 'impaired people',\n",
       " 'multimodal approach',\n",
       " 'tactile graphics']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extract_keywords_by_freq(train['title'].iloc[0], train['abstract'].iloc[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3ae85e9a-bbe8-475b-88ec-cff2923eec35",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "train_acc = []\n",
    "for row in train.iterrows():\n",
    "    prediction_keywords = extract_keywords_by_freq(row[1].title, row[1].abstract)\n",
    "    keywords = row[1].Keywords.lower().split('; ')\n",
    "    acc = len(set(prediction_keywords) & set(keywords)) / len(keywords)\n",
    "    train_acc.append(acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "030b054c-925a-4b85-8407-051ab79bc10a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.10382675141265374"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(train_acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c9427262-2b1f-45e8-963e-a591042af2d3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "test_words = []\n",
    "for row in test.iterrows():\n",
    "    prediction_keywords = extract_keywords_by_freq(row[1].title, row[1].abstract)\n",
    "    prediction_keywords = [x.title() for x in prediction_keywords]\n",
    "    if len(prediction_keywords) == 0:\n",
    "        prediction_keywords = ['A', 'B']\n",
    "    test_words.append('; '.join(prediction_keywords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b4e3386a-a7a1-47fa-b99a-9a52dd4bb99a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "test['Keywords'] = test_words\n",
    "test[['uuid', 'Keywords', 'label']].to_csv('submit.csv', index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0627479-60ba-45ef-9902-e22abab982df",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
